Retweeting tweets from ESTRO37

The current new hotness for retrieving and manipulating Twitter data in the R world is rtweet, which is what we’ll use today.

The basic outline of code and analysis will be a mishmash of Mike Kearney’s Ruser2018 analysis and my old NIPS2016 analysis. Kearney’s analysis is quite good, I recommend it for anyone that is interested in methodology.

The network vis also borrows from a notebook by a cool biologist, Kenneth Turner — check it out.

Kearney is the author of rtweet, so there are often a few more details sprinkled throughout his analysis.

Before beginning, I’ll sprinkle in some of the highly retweeted tweets from the first couple of days.

First, a good tweet:

Then, a curious tweet (well, I think conferences that still try to lock down taking pictures are fighting against a river at this point):

library(rtweet)

searchfield <- c("ESTRO37")

## Location of the cached search results from any previous run.
search_path <- file.path("data", "search.rds")

## Resume from the newest cached status ID so we only pull tweets
## we have not seen yet; NULL means "no cache, fetch everything".
if (file.exists(search_path)) {
  previous <- readRDS(search_path)
  since_id <- previous$status_id[1]
} else {
  since_id <- NULL
}

## Search for up to 100,000 tweets mentioning the conference hashtag.
rt <- search_tweets(
  paste(searchfield, collapse = " OR "),
  n = 1e5, verbose = FALSE,
  since_id = since_id,
  retryonratelimit = TRUE
)

## Merge the fresh pull with any previously saved search results,
## dropping tweets we have already seen (matched on status_id).
if (file.exists(file.path("data", "search.rds"))) {

  cached <- readRDS(file.path("data", "search.rds"))

  ## Combine rows from both pulls (tweets AND the attached users data).
  rt <- do_call_rbind(list(rt, cached))

  ## Mask of first occurrences of each status ID; apply the same mask
  ## to the users data so its rows stay aligned with the tweets.
  keep <- !duplicated(rt$status_id)
  users <- users_data(rt)[keep, ]
  rt <- rt[keep, ]

  ## Restore the deduplicated users data as the attribute rtweet expects.
  attr(rt, "users") <- users
}

## Persist the full data set.
saveRDS(rt, file.path("data", "search.rds"))

## Persist a shareable version containing only the status IDs.
saveRDS(rt[, "status_id"], file.path("data", "search-ids.rds"))

Initial vis

Time series of the data in two-hour chunks. One REALLY nice thing about rtweet is that it makes plotting the time series of tweets a completely lazy-person’s function, ts_plot, where you can feed it the aggregation time to summarize over. Here, we go with 2 hours as that seems like a good medium to begin with.

suppressPackageStartupMessages(library(tidyverse))
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
## Tweet volume over time: two-hour bins with a loess trend line.
rt %>%
  filter(created_at > "2018-01-29") %>%
  ts_plot("2 hours", color = "transparent") +
  geom_smooth(
    method = "loess", se = FALSE, span = .1,
    size = 2, colour = "#0066aa"
  ) +
  geom_point(
    size = 5, shape = 21,
    fill = "#ADFF2F99", colour = "#000000dd"
  ) +
  theme(
    axis.text = element_text(colour = "#222222"),
    text = element_text(family = "Roboto Condensed"),
    plot.title = element_text(size = rel(1.7), face = "bold"),
    plot.subtitle = element_text(size = rel(1.3)),
    plot.caption = element_text(colour = "#444444")
  ) +
  labs(
    title = "Frequency of tweets about ESTRO37 over time",
    subtitle = "Twitter status counts aggregated using two-hour intervals",
    caption = "\n\nSource: Data gathered via Twitter's standard `search/tweets` API using rtweet",
    x = NULL, y = NULL
  )
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at 1.5242e+09
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 15552
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 2.4186e+08

Sentiment analysis

Again, another analysis I’ve done in the past, but here I’ll use Mike Kearney’s version simply to make my life a bit easier as it’s mapping on to rtweet’s data, and I haven’t actually used syuzhet, which is a commonly used sentiment analysis package.

## Clean up the tweet text before sentiment scoring: strip leading
## "RT"/"RT:" prefixes, "#" markers (the hashtag word itself is kept),
## @mentions, and http(s) links.
rt$text2 <- gsub(
  "^RT:?\\s{0,}|#|@\\S+|https?[[:graph:]]+", "", rt$text)
## convert to lower case (must come after the case-sensitive "RT" strip)
rt$text2 <- tolower(rt$text2)
## trim leading/trailing white space ...
rt$text2 <- gsub("^\\s{1,}|\\s{1,}$", "", rt$text2)
## ... then collapse internal runs of whitespace to a single space
rt$text2 <- gsub("\\s{2,}", " ", rt$text2)

## estimate pos/neg sentiment for each tweet using the "syuzhet" method
## (positive values = positive valence, negative = negative)
rt$sentiment <- syuzhet::get_sentiment(rt$text2, "syuzhet")

## Floor a date-time to the start of its `sec`-second interval.
##
## @param x   A POSIXct vector (or anything as.numeric() converts to
##            seconds since the Unix epoch).
## @param sec Interval width in seconds (e.g. 3600 = one hour).
## @return POSIXct vector floored to the interval boundary (UTC).
round_time <- function(x, sec) {
  ## Integer-divide the epoch seconds to drop the sub-interval part,
  ## then rebuild a POSIXct directly with an explicit origin/tz.
  ## The original detoured through hms::hms() and relied on an
  ## implicit hms -> POSIXct coercion with no stated origin.
  as.POSIXct(as.numeric(x) %/% sec * sec,
             origin = "1970-01-01", tz = "UTC")
}

## Mean sentiment per one-hour bin, tagged with its sign for colouring.
hourly_sentiment <- rt %>%
  mutate(time = round_time(created_at, 60 * 60)) %>%
  group_by(time) %>%
  summarise(sentiment = mean(sentiment, na.rm = TRUE)) %>%
  mutate(valence = ifelse(sentiment > 0L, "Positive", "Negative"))

## Hourly sentiment over time: loess trend plus sign-coloured points.
ggplot(hourly_sentiment, aes(x = time, y = sentiment)) +
  geom_smooth(method = "loess", span = .1,
              colour = "#aa11aadd", fill = "#bbbbbb11") +
  geom_point(aes(fill = valence, colour = valence),
             shape = 21, alpha = .6, size = 3.5) +
  theme(legend.position = "none",
        text = element_text(family = "Roboto Condensed"),
        axis.text = element_text(colour = "#222222"),
        plot.title = element_text(size = rel(1.7), face = "bold"),
        plot.subtitle = element_text(size = rel(1.3)),
        plot.caption = element_text(colour = "#444444")) +
  scale_fill_manual(
    values = c(Positive = "#2244ee", Negative = "#dd2222")) +
  scale_colour_manual(
    values = c(Positive = "#001155", Negative = "#550000")) +
  labs(x = NULL, y = NULL,
       title = "Sentiment (valence) of ESTRO37 tweets over time",
       subtitle = "Mean sentiment of tweets aggregated in one-hour intervals",
       caption = "\nSource: Data gathered using rtweet. Sentiment analysis done using syuzhet")

Tweet busters

So… Who are the top ranking tweeps currently?

## Per-account totals (retweets included): favourites received,
## retweets received, and number of statuses, ranked by volume.
showvals <- rt %>%
  select(favorite_count, retweet_count, screen_name, name) %>%
  group_by(screen_name, name) %>%
  summarise(fav_count = sum(favorite_count),
            rt_count = sum(retweet_count),
            n = n()) %>%
  arrange(desc(n))

knitr::kable(showvals[1:40, ])
screen_name name fav_count rt_count n
FuenteApolo Castalia 205 1149 270
roentgen66 Virginia Ruiz 0 989 224
JoaquinJCabrera Joaquín J Cabrera 135 514 122
davidbermudezi Ramiro Bermudez-I.MD 0 696 121
cd_fuller dave fuller 3 463 93
BreastDocUK Richard Simcock 390 493 81
anitaodonovan1 Anita O’Donovan 260 390 73
LuisAlberto3P Dr. Luis A. Pérez-Romasanta 90 431 71
AmadeoWals Amadeo Wals 35 251 63
Monthy_A Angel Montero Luis 34 371 59
MsConcu Tere M. Migueláñez 76 187 53
subatomicdoc Matthew Katz, MD 62 314 51
p_blancha Pierre Blanchard, MD 215 340 50
CancerGeek CancerGeek 139 216 47
Dr_ASalem Ahmed Salem 28 167 44
RT_physics RTphysics Manchester 221 173 43
Rad_Nation Radiation Nation 106 284 37
FREELANCEROG FROG 37 173 36
Accuray Accuray Incorporated 51 106 33
adelapoite Adela 0 187 33
gerryhanna Gerry Hanna 187 166 30
JulieMcCrossin Julie McCrossin 60 243 30
simongoldswort1 Simon Goldsworthy 74 141 30
LionelREICHARDT PHARMAGEEK 68 125 29
cancerphysicist Ane Appelt 179 137 28
achoud72 Dr Ananya 33 116 26
syeepei YeePei Song 67 45 26
christian_roenn Chr Rønn Hansen 142 56 25
ESTRO_RT ESTRO 296 121 25
ClinOncologist David Woolf 116 106 23
MPHRadiotherapy MPHRadiotherapy 0 122 23
MichRadioactiva MiVQ 0 161 22
TovarAris María Isabel Tovar 0 173 22
WarrenBacorro Warren Bacorro 24 121 21
antheasaif Anthea Cree 20 67 19
lucindamorris23 Lucinda Morris 110 165 18
Elekta Elekta 80 37 17
finn_corinne Corinne Faivre-Finn 98 71 17
FYOncologist FYO 55 68 17
sandraturner49 Sandra Turner 54 182 17
## Top 40 by status count -- includes both original tweets and retweets.
showvals[1:40, ] %>%
  transform(screen_name = reorder(screen_name, n)) %>%
  ggplot(aes(screen_name, n)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = NULL, y = NULL,
       title = "Top 40 tweeters of ESTRO37")

## Re-rank using original tweets only (retweets excluded), so `n`
## counts authored statuses and rt_count counts retweets *received*.
showvals <- rt %>%
  filter(is_retweet == FALSE) %>%
  select(favorite_count, retweet_count, screen_name, name) %>%
  group_by(screen_name, name) %>%
  summarise(fav_count = sum(favorite_count),
            rt_count = sum(retweet_count),
            n = n()) %>%
  arrange(desc(n))

showvals[1:40, ] %>%
  transform(screen_name = reorder(screen_name, n)) %>%
  ggplot(aes(screen_name, n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = NULL, y = NULL,
       ## Fixed title: this chart EXCLUDES retweets, so it ranks authors
       ## of original tweets -- the old title said "retweeters".
       title = "Top 40 original tweeters of ESTRO37")

Influence (dubious)

Here is that somewhat (very) dumb influence metric I cooked up or adapted from elsewhere, I can’t quite remember at this point. Either way I don’t put much value in it. It’s basically just the sum of favorites and retweets

library(viridis)
## Loading required package: viridisLite
## "Impact" = favourites + retweets received; the fill encodes
## impact-per-status (impact / n) on a log scale.
showvals2 <- showvals %>%
  mutate(impact = fav_count + rt_count) %>%
  arrange(desc(impact))

showvals2[1:40, ] %>%
  transform(screen_name = reorder(screen_name, impact)) %>%
  ggplot(aes(screen_name, impact, fill = impact / n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ylab("Impact (numFavorites + numRetweets)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_viridis(trans = "log", breaks = c(1, 5, 10, 50))

# Still using the hated word cloud

The word cloud gets a bad rap, I mean, it IS basically impossible to interpret in any numerical or comparative sense. But I do still find it useful to get a quick overview of just what types of things people are talking about. And thus, wordclouds we go.

library(tidytext)
library(RColorBrewer)

## One row per (tweet, word) tokenised from the cleaned text.
tidy_df <- rt %>% unnest_tokens(word, text2)

## Conference-specific stop words. BUG FIX: the original misplaced the
## closing paren of c(), so `lexicon='whatevs'` ended up *inside* the
## word vector ("whatevs" became a stop word and the intended lexicon
## column never existed). Built as character (not factor) so
## anti_join() no longer warns about factor coercion.
tw_stop <- data.frame(
  word = c("ESTRO37", "estro37", "rtt", "n", "24", "30", "1300"),
  lexicon = "whatevs",
  stringsAsFactors = FALSE
)

## Very common English/Spanish words from rtweet's stopwordslangs.
stop_words <- filter(stopwordslangs, (lang == "en" | lang == "es") & p > .9999) %>%
  pull(word)
stop_words <- tibble(word = stop_words)

## Drop both stop-word sets from the token list.
tidy_cloud <- tidy_df %>%
  anti_join(tw_stop) %>%
  anti_join(stop_words)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Joining, by = "word"
## Frequency of each remaining word, fed straight into the cloud.
hm <- tidy_cloud %>%
  count(word)

library(wordcloud)
wordcloud(hm$word, hm$n,
          max.words = 150,
          colors = brewer.pal(8, "Dark2"),
          random.order = FALSE,
          random.color = FALSE)
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : leadership could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : mundial could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : hypoxia could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : impact could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : satisfaction could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : bcsm could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : improving could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : lecture could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : present could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : science could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : inmunoonc could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : issues could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : practice could not be fit on page. It will not be plotted.
## Warning in wordcloud(hm$word, hm$n, max.words = 150, colors =
## brewer.pal(8, : proud could not be fit on page. It will not be plotted.

Finally, the network

OK, after all of that, what you actually really cared about was the network, right?

 library(igraph) 
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are *required* to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see http://bit.ly/arialnarrow
## Mention graph: one edge per (author, mentioned account) pair,
## weighted by how many statuses contain that mention.
m_g <- rt %>%
  select(screen_name, mentions_screen_name) %>%
  unnest(mentions_screen_name) %>%
  filter(!is.na(mentions_screen_name)) %>%
  group_by(screen_name, mentions_screen_name) %>%
  summarise(weight = n()) %>%
  graph_from_data_frame()

library(igraph)
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
## 
##     groups
## The following object is masked from 'package:stats':
## 
##     filter
library(ggraph)

  
  

m_graph <- m_g

## Rank every account by its degree in the mention graph and keep the
## high-degree tail. FIX: the split is at the 95th percentile
## (probs = c(0, .95, 1)), but the original labels said
## "Bottom99"/"Top1" -- renamed to match the actual cut.
dfv <- data.frame(V = as.vector(V(m_graph)),
                  screen_name = V(m_graph)$name,
                  degree(m_graph))
names(dfv)[3] <- "degree"
dfv <- cbind(dfv,
             quantile = cut(dfv$degree,
                            breaks = quantile(dfv$degree, probs = c(0, .95, 1)),
                            labels = c("Bottom95", "Top5"),
                            include.lowest = TRUE))
dfv$quantile <- as.character(dfv$quantile)
library(dplyr)
dfv2 <- arrange(dfv, desc(quantile))
dfv3 <- dfv2[dfv2$quantile == "Top5", ]
print(nrow(dfv3))
## [1] 52
## Subgraph containing only the top-5% nodes.
red_gr <- induced_subgraph(m_graph, dfv3$V)

#g=as_tbl_graph(red_gr) %>%
#  mutate(pop=centrality_pagerank())

#ggraph(g,layout='kk')+
#  geom_edge_fan(aes(alpha=..index..),show.legend=FALSE) +
#  geom_node_point(aes(size=pop),show.legend=FALSE) +geom_node_label(aes(label=name))+theme_graph()


## Label and size only the well-connected nodes (degree > 20);
## everything else gets an empty label and zero size.
deg <- degree(red_gr)
V(red_gr)$node_label <- unname(ifelse(deg > 20, names(V(red_gr)), ""))
V(red_gr)$node_size <- unname(ifelse(deg > 20, deg, 0))



## Circular arc layout of the mention subgraph. FIX: the subtitle said
## "more retweets" -- a copy-paste from the retweet plot below; this
## plot shows mentions. Also corrected the "Darkers" typo.
ggraph(red_gr, layout = 'linear', circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(aes(label = node_label, size = node_size),
                  label.size = 0, fill = "#ffffff66",
                  segment.colour = "springgreen",
                  color = "slateblue", repel = TRUE,
                  family = font_rc, fontface = "bold") +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(title = "Mention Relationships",
       subtitle = "Most mentioned screen names labeled. Darker edges == more mentions. Node size == larger degree") +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")

# retweet analysis

## Retweet graph: one edge per retweeted status
## (screen_name -> retweet_screen_name).
rt_g <- rt %>%
  filter(retweet_count > 0) %>%
  select(screen_name, retweet_screen_name) %>%
  filter(!is.na(retweet_screen_name)) %>%
  graph_from_data_frame()


## Rank accounts by degree in the retweet graph and keep the
## high-degree tail. FIX: the split is at the 90th percentile
## (probs = c(0, .9, 1)), but the original labels said
## "Bottom99"/"Top1" -- renamed to match the actual cut.
dfv <- data.frame(V = as.vector(V(rt_g)),
                  screen_name = V(rt_g)$name,
                  degree(rt_g))
names(dfv)[3] <- "degree"
dfv <- cbind(dfv,
             quantile = cut(dfv$degree,
                            breaks = quantile(dfv$degree, probs = c(0, .9, 1)),
                            labels = c("Bottom90", "Top10"),
                            include.lowest = TRUE))
dfv$quantile <- as.character(dfv$quantile)
library(dplyr)
dfv2 <- arrange(dfv, desc(quantile))
dfv3 <- dfv2[dfv2$quantile == "Top10", ]
print(nrow(dfv3))
## [1] 88
## Sanity check: how many of these accounts' statuses are retweets
## versus original tweets.
ndf <- rt %>% filter(screen_name %in% dfv3$screen_name)
nrow(ndf %>% filter(!is.na(retweet_screen_name)))
## [1] 1600
nrow(ndf %>% filter(is.na(retweet_screen_name)))
## [1] 874
## Subgraph containing only the top-10% nodes.
red_gr_rt <- induced_subgraph(rt_g, dfv3$V)



## Label and size only the well-connected nodes (degree > 25);
## everything else gets an empty label and zero size.
deg_rt <- degree(red_gr_rt)
V(red_gr_rt)$node_label <- unname(ifelse(deg_rt > 25, names(V(red_gr_rt)), ""))
V(red_gr_rt)$node_size <- unname(ifelse(deg_rt > 25, deg_rt, 0))



## Circular arc layout of the retweet subgraph.
## FIX: corrected the "Darkers" typo in the user-facing subtitle.
ggraph(red_gr_rt, layout = 'linear', circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(aes(label = node_label, size = node_size),
                  label.size = 0, fill = "#ffffff66",
                  segment.colour = "springgreen",
                  color = "slateblue", repel = TRUE,
                  family = font_rc, fontface = "bold") +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(title = "Retweet Relationships",
       subtitle = "Most retweeted screen names labeled. Darker edges == more retweets. Node size == larger degree") +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")